library(fmsb)
library(Hmisc)
library(data.table)
require(plyr)
library(readr)

## Extract the domain (host part) of a single URL.
##
## A leading "http://" or "https://" scheme and a leading "www." prefix are
## removed, and everything before the first "/" of the remainder is returned.
##
## x: a URL; only the first element is used when a longer vector is passed.
## Returns a length-one character vector, or NA_character_ when x is empty,
## NA, or has nothing left once the prefix has been stripped.
domain <- function(x) {
  # The pattern is anchored so that only a *leading* scheme / "www." is
  # dropped; an unanchored pattern would also eat "www." inside a host name
  # (e.g. "awww.com" would become "acom").
  stripped <- sub("^(http://|https://)?(www\\.)?", "", x)
  pieces <- strsplit(stripped, "/", fixed = TRUE)

  # strsplit() yields character(0) for "", which cannot be indexed.
  if (length(pieces) == 0L || length(pieces[[1L]]) == 0L) {
    return(NA_character_)
  }
  pieces[[1L]][[1L]]
}

# load user domain dataset (one row per username/domain observation)
users_domains <- readRDS("./users_domains_feb.rds")
# usernames are lower-cased here because they are matched against
# df_pol_prox$username further down
# NOTE(review): assumes df_pol_prox$username is already lower case - confirm
users_domains$username <- tolower(users_domains$username)

# per-user proximity table; its first 10 columns are averaged per domain below
# NOTE(review): those columns are assumed to follow the party column order of
# domain_pol_prox - confirm against the file that produces this RDS
df_pol_prox <- readRDS("./user_proximity_gen.rds")

# get latest MINE2018 source analysis
mine_source_analysis <- readRDS("./source_analysis_feb.rds")

# distinct domains present in the user/domain data
domains_list <- unique(users_domains$domain)
# droplevels() has no method for character vectors, so only call it when the
# domain column is actually stored as a factor
if (is.factor(domains_list)) {
  domains_list <- droplevels(domains_list)
}

# one row per domain, one zero-initialised score column per party
domain_pol_prox <- data.frame(
  domain = domains_list,
  Civica_popolare = 0,
  FdI = 0,
  FI = 0,
  Insieme = 0,
  LeU = 0,
  LN = 0,
  M5S = 0,
  PD = 0,
  Piu_Europa = 0,
  PP = 0
)

# For every domain shared by more than 2 distinct users, replace the
# zero-initialised score columns (2:11) with the column-wise mean of the first
# 10 columns of df_pol_prox over those users. Other domains keep their zeros.

# Compare domains as character strings: `==` between two factors stops with
# "level sets of factors are different" when their levels do not coincide.
user_domain_keys <- as.character(users_domains$domain)
target_domain_keys <- as.character(domain_pol_prox$domain)

# seq_len() iterates zero times on an empty table (1:0 would run twice)
for (i in seq_len(nrow(domain_pol_prox))) {
  # which() discards NA comparisons; plain logical indexing would turn every
  # NA domain into an extra NA username and inflate the distinct-user count
  hits <- which(user_domain_keys == target_domain_keys[i])
  domain_users <- unique(users_domains$username[hits])

  if (length(domain_users) > 2) {
    df_temp <- df_pol_prox[df_pol_prox$username %in% domain_users, ]
    domain_pol_prox[i, 2:11] <- apply(df_temp[1:10], 2, mean)
    rm(df_temp)
  }
  rm(hits, domain_users)
}
rm(user_domain_keys, target_domain_keys)

# Drop domains whose score columns were never filled in (row sum of columns
# 2:11 still 0) or whose scores are NA/NaN.
score_total <- rowSums(domain_pol_prox[, 2:11])
domain_pol_prox <- domain_pol_prox[which(score_total > 0), ]
rm(score_total)

# Attach the source analysis columns; plyr::join() is a left join by default,
# so domains missing from mine_source_analysis get NA in the added columns.
domain_pol_prox <- plyr::join(domain_pol_prox, mine_source_analysis, by = "domain")

# Keep only domains with at least 2 urls; rows with NA urls are dropped too.
domain_pol_prox <- domain_pol_prox[which(domain_pol_prox$urls > 1), ]


write.csv(domain_pol_prox, file = "./mediasources_mpas.csv", row.names = FALSE)
# NOTE(review): the file name below is spelled "proxmity"; it is left as is
# because other scripts may read this exact path - confirm before renaming
saveRDS(object = domain_pol_prox, file = "./domain_political_proxmity_def.RDS")
